#include "cuda_runtime.h"
#include "device_launch_parameters.h"

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x1 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x1_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x1 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x1_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x1 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x1_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x1 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x1_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x2 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x2_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x2 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x2_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x2 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x2_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x2 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x2_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x4 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x4_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x4 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x4_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x4 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x4_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, no prefetching, 1x4 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_noPrefetch_granularity_1x4_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x1 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x1_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x1 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x1_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x1 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x1_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x1 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x1_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x2 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x2_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x2 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x2_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x2 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x2_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x2 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x2_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x4 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x4_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x4 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x4_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x4 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x4_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 8, prefetching, 1x4 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_8_prefetch_granularity_1x4_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x1 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x1_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x1 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x1_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x1 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x1_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x1 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x1_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x2 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x2_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x2 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x2_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x2 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x2_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x2 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x2_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x4 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x4_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x4 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x4_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x4 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x4_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, no prefetching, 1x4 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_noPrefetch_granularity_1x4_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x1 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x1_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x1 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x1_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x1 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x1_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x1 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x1_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x2 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x2_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x2 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x2_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x2 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x2_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x2 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x2_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x4 work granularity and
 * no loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x4_noLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x4 work granularity and
 * 2x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x4_2LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x4 work granularity and
 * 4x loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x4_4LoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);

/**
 * Kernel declaration only (`__global__`, no body); the definition is not shown here.
 * NOTE(review): going by the name alone, this looks like the matrix-multiply
 * variant with tile/block size 16, prefetching, 1x4 work granularity and
 * full loop unrolling -- confirm against the definition.
 *
 * @param result   non-const float pointer; presumably the output matrix (TODO confirm)
 * @param matrix1  const float pointer; presumably the first operand (TODO confirm)
 * @param matrix2  const float pointer; presumably the second operand (TODO confirm)
 * @param height   int by value; presumably a matrix extent (TODO confirm which)
 * @param width    int by value; presumably a matrix extent (TODO confirm which)
 * @param length   int by value; presumably a matrix extent (TODO confirm which)
 *
 * Required grid/block layout and any shared-memory use cannot be determined from this declaration.
 */
__global__  void multiplyKernel_16_prefetch_granularity_1x4_fullLoopUnrolling(
	float *result, 
	const float *matrix1, 
	const float *matrix2,
	const int height,
	const int width, 
	const int length);
